Name – Prakhar Pratap Singh

Reg no - 21BCL0158

ASSIGNMNET - 4

DATA PREPROCESSING

In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:

#Import the dataset.
df=pd.read_csv("Employee-atrition.csv")

In [6]:

df.head()

Out[6]:

Age

Attrition

BusinessTravel

DailyRate

Department

DistanceFromHome

Education

EducationField

EmployeeCount

EmployeeNumber

...

RelationshipSatisfaction

StandardHours

StockOptionLevel

TotalWorkingYears

TrainingTimesLastYear

WorkLifeBalance

YearsAtCompany

YearsInCurrentRole

YearsSinceLastPromotion

YearsWithCurrManager

0

41

Yes

Travel_Rarely

1102

Sales

1

2

Life Sciences

1

1

...

1

80

0

8

0

1

6

4

0

5

1

49

No

Travel_Frequently

279

Research & Development

8

1

Life Sciences

1

2

...

4

80

1

10

3

3

10

7

1

7

2

37

Yes

Travel_Rarely

1373

Research & Development

2

2

Other

1

4

...

2

80

0

7

3

3

0

0

0

0

3

33

No

Travel_Frequently

1392

Research & Development

3

4

Life Sciences

1

5

...

3

80

0

8

3

3

8

7

3

0

4

27

No

Travel_Rarely

591

Research & Development

2

1

Medical

1

7

...

4

80

1

6

3

3

2

2

2

2

5 rows × 35 columns

In [7]:

df.shape

Out[7]:

(1470, 35)

In [8]:

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                  1470 non-null   int64 
 15  JobRole                   1470 non-null   object
 16  JobSatisfaction           1470 non-null   int64 
 17  MaritalStatus             1470 non-null   object
 18  MonthlyIncome             1470 non-null   int64 
 19  MonthlyRate               1470 non-null   int64 
 20  NumCompaniesWorked        1470 non-null   int64 
 21  Over18                    1470 non-null   object
 22  OverTime                  1470 non-null   object
 23  PercentSalaryHike         1470 non-null   int64 
 24  PerformanceRating         1470 non-null   int64 
 25  RelationshipSatisfaction  1470 non-null   int64 
 26  StandardHours             1470 non-null   int64 
 27  StockOptionLevel          1470 non-null   int64 
 28  TotalWorkingYears         1470 non-null   int64 
 29  TrainingTimesLastYear     1470 non-null   int64 
 30  WorkLifeBalance           1470 non-null   int64 
 31  YearsAtCompany            1470 non-null   int64 
 32  YearsInCurrentRole        1470 non-null   int64 
 33  YearsSinceLastPromotion   1470 non-null   int64 
 34  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(26), object(9)
memory usage: 402.1+ KB

In [9]:

df.describe()

Out[9]:

Age

DailyRate

DistanceFromHome

Education

EmployeeCount

EmployeeNumber

EnvironmentSatisfaction

HourlyRate

JobInvolvement

JobLevel

...

RelationshipSatisfaction

StandardHours

StockOptionLevel

TotalWorkingYears

TrainingTimesLastYear

WorkLifeBalance

YearsAtCompany

YearsInCurrentRole

YearsSinceLastPromotion

YearsWithCurrManager

count

1470.000000

1470.000000

1470.000000

1470.000000

1470.0

1470.000000

1470.000000

1470.000000

1470.000000

1470.000000

...

1470.000000

1470.0

1470.000000

1470.000000

1470.000000

1470.000000

1470.000000

1470.000000

1470.000000

1470.000000

mean

36.923810

802.485714

9.192517

2.912925

1.0

1024.865306

2.721769

65.891156

2.729932

2.063946

...

2.712245

80.0

0.793878

11.279592

2.799320

2.761224

7.008163

4.229252

2.187755

4.123129

std

9.135373

403.509100

8.106864

1.024165

0.0

602.024335

1.093082

20.329428

0.711561

1.106940

...

1.081209

0.0

0.852077

7.780782

1.289271

0.706476

6.126525

3.623137

3.222430

3.568136

min

18.000000

102.000000

1.000000

1.000000

1.0

1.000000

1.000000

30.000000

1.000000

1.000000

...

1.000000

80.0

0.000000

0.000000

0.000000

1.000000

0.000000

0.000000

0.000000

0.000000

25%

30.000000

465.000000

2.000000

2.000000

1.0

491.250000

2.000000

48.000000

2.000000

1.000000

...

2.000000

80.0

0.000000

6.000000

2.000000

2.000000

3.000000

2.000000

0.000000

2.000000

50%

36.000000

802.000000

7.000000

3.000000

1.0

1020.500000

3.000000

66.000000

3.000000

2.000000

...

3.000000

80.0

1.000000

10.000000

3.000000

3.000000

5.000000

3.000000

1.000000

3.000000

75%

43.000000

1157.000000

14.000000

4.000000

1.0

1555.750000

4.000000

83.750000

3.000000

3.000000

...

4.000000

80.0

1.000000

15.000000

3.000000

3.000000

9.000000

7.000000

3.000000

7.000000

max

60.000000

1499.000000

29.000000

5.000000

1.0

2068.000000

4.000000

100.000000

4.000000

5.000000

...

4.000000

80.0

3.000000

40.000000

6.000000

4.000000

40.000000

18.000000

15.000000

17.000000

8 rows × 26 columns

In [10]:

df.describe()

Out[10]:

Age

DailyRate

DistanceFromHome

Education

EmployeeCount

EmployeeNumber

EnvironmentSatisfaction

HourlyRate

JobInvolvement

JobLevel

...

RelationshipSatisfaction

StandardHours

StockOptionLevel

TotalWorkingYears

TrainingTimesLastYear

WorkLifeBalance

YearsAtCompany

YearsInCurrentRole

YearsSinceLastPromotion

YearsWithCurrManager

count

1470.000000

1470.000000

1470.000000

1470.000000

1470.0

1470.000000

1470.000000

1470.000000

1470.000000

1470.000000

...

1470.000000

1470.0

1470.000000

1470.000000

1470.000000

1470.000000

1470.000000

1470.000000

1470.000000

1470.000000

mean

36.923810

802.485714

9.192517

2.912925

1.0

1024.865306

2.721769

65.891156

2.729932

2.063946

...

2.712245

80.0

0.793878

11.279592

2.799320

2.761224

7.008163

4.229252

2.187755

4.123129

std

9.135373

403.509100

8.106864

1.024165

0.0

602.024335

1.093082

20.329428

0.711561

1.106940

...

1.081209

0.0

0.852077

7.780782

1.289271

0.706476

6.126525

3.623137

3.222430

3.568136

min

18.000000

102.000000

1.000000

1.000000

1.0

1.000000

1.000000

30.000000

1.000000

1.000000

...

1.000000

80.0

0.000000

0.000000

0.000000

1.000000

0.000000

0.000000

0.000000

0.000000

25%

30.000000

465.000000

2.000000

2.000000

1.0

491.250000

2.000000

48.000000

2.000000

1.000000

...

2.000000

80.0

0.000000

6.000000

2.000000

2.000000

3.000000

2.000000

0.000000

2.000000

50%

36.000000

802.000000

7.000000

3.000000

1.0

1020.500000

3.000000

66.000000

3.000000

2.000000

...

3.000000

80.0

1.000000

10.000000

3.000000

3.000000

5.000000

3.000000

1.000000

3.000000

75%

43.000000

1157.000000

14.000000

4.000000

1.0

1555.750000

4.000000

83.750000

3.000000

3.000000

...

4.000000

80.0

1.000000

15.000000

3.000000

3.000000

9.000000

7.000000

3.000000

7.000000

max

60.000000

1499.000000

29.000000

5.000000

1.0

2068.000000

4.000000

100.000000

4.000000

5.000000

...

4.000000

80.0

3.000000

40.000000

6.000000

4.000000

40.000000

18.000000

15.000000

17.000000

8 rows × 26 columns

In [11]:

df.Attrition.value_counts()

Out[11]:

No     1233
Yes     237
Name: Attrition, dtype: int64

In [ ]:

#Check for null values

In [12]:

df.isnull().sum()

Out[12]:

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [13]:

sns.displot(df['Age'])

Out[13]:

<seaborn.axisgrid.FacetGrid at 0x15c45bb36d0>

In [14]:

df.corr()

Out[14]:

Age

DailyRate

DistanceFromHome

Education

EmployeeCount

EmployeeNumber

EnvironmentSatisfaction

HourlyRate

JobInvolvement

JobLevel

...

RelationshipSatisfaction

StandardHours

StockOptionLevel

TotalWorkingYears

TrainingTimesLastYear

WorkLifeBalance

YearsAtCompany

YearsInCurrentRole

YearsSinceLastPromotion

YearsWithCurrManager

Age

1.000000

0.010661

-0.001686

0.208034

NaN

-0.010145

0.010146

0.024287

0.029820

0.509604

...

0.053535

NaN

0.037510

0.680381

-0.019621

-0.021490

0.311309

0.212901

0.216513

0.202089

DailyRate

0.010661

1.000000

-0.004985

-0.016806

NaN

-0.050990

0.018355

0.023381

0.046135

0.002966

...

0.007846

NaN

0.042143

0.014515

0.002453

-0.037848

-0.034055

0.009932

-0.033229

-0.026363

DistanceFromHome

-0.001686

-0.004985

1.000000

0.021042

NaN

0.032916

-0.016075

0.031131

0.008783

0.005303

...

0.006557

NaN

0.044872

0.004628

-0.036942

-0.026556

0.009508

0.018845

0.010029

0.014406

Education

0.208034

-0.016806

0.021042

1.000000

NaN

0.042070

-0.027128

0.016775

0.042438

0.101589

...

-0.009118

NaN

0.018422

0.148280

-0.025100

0.009819

0.069114

0.060236

0.054254

0.069065

EmployeeCount

NaN

NaN

NaN

NaN

NaN

NaN

NaN

NaN

NaN

NaN

...

NaN

NaN

NaN

NaN

NaN

NaN

NaN

NaN

NaN

NaN

EmployeeNumber

-0.010145

-0.050990

0.032916

0.042070

NaN

1.000000

0.017621

0.035179

-0.006888

-0.018519

...

-0.069861

NaN

0.062227

-0.014365

0.023603

0.010309

-0.011240

-0.008416

-0.009019

-0.009197

EnvironmentSatisfaction

0.010146

0.018355

-0.016075

-0.027128

NaN

0.017621

1.000000

-0.049857

-0.008278

0.001212

...

0.007665

NaN

0.003432

-0.002693

-0.019359

0.027627

0.001458

0.018007

0.016194

-0.004999

HourlyRate

0.024287

0.023381

0.031131

0.016775

NaN

0.035179

-0.049857

1.000000

0.042861

-0.027853

...

0.001330

NaN

0.050263

-0.002334

-0.008548

-0.004607

-0.019582

-0.024106

-0.026716

-0.020123

JobInvolvement

0.029820

0.046135

0.008783

0.042438

NaN

-0.006888

-0.008278

0.042861

1.000000

-0.012630

...

0.034297

NaN

0.021523

-0.005533

-0.015338

-0.014617

-0.021355

0.008717

-0.024184

0.025976

JobLevel

0.509604

0.002966

0.005303

0.101589

NaN

-0.018519

0.001212

-0.027853

-0.012630

1.000000

...

0.021642

NaN

0.013984

0.782208

-0.018191

0.037818

0.534739

0.389447

0.353885

0.375281

JobSatisfaction

-0.004892

0.030571

-0.003669

-0.011296

NaN

-0.046247

-0.006784

-0.071335

-0.021476

-0.001944

...

-0.012454

NaN

0.010690

-0.020185

-0.005779

-0.019459

-0.003803

-0.002305

-0.018214

-0.027656

MonthlyIncome

0.497855

0.007707

-0.017014

0.094961

NaN

-0.014829

-0.006259

-0.015794

-0.015271

0.950300

...

0.025873

NaN

0.005408

0.772893

-0.021736

0.030683

0.514285

0.363818

0.344978

0.344079

MonthlyRate

0.028051

-0.032182

0.027473

-0.026084

NaN

0.012648

0.037600

-0.015297

-0.016322

0.039563

...

-0.004085

NaN

-0.034323

0.026442

0.001467

0.007963

-0.023655

-0.012815

0.001567

-0.036746

NumCompaniesWorked

0.299635

0.038153

-0.029251

0.126317

NaN

-0.001251

0.012594

0.022157

0.015012

0.142501

...

0.052733

NaN

0.030075

0.237639

-0.066054

-0.008366

-0.118421

-0.090754

-0.036814

-0.110319

PercentSalaryHike

0.003634

0.022704

0.040235

-0.011111

NaN

-0.012944

-0.031701

-0.009062

-0.017205

-0.034730

...

-0.040490

NaN

0.007528

-0.020608

-0.005221

-0.003280

-0.035991

-0.001520

-0.022154

-0.011985

PerformanceRating

0.001904

0.000473

0.027110

-0.024539

NaN

-0.020359

-0.029548

-0.002172

-0.029071

-0.021222

...

-0.031351

NaN

0.003506

0.006744

-0.015579

0.002572

0.003435

0.034986

0.017896

0.022827

RelationshipSatisfaction

0.053535

0.007846

0.006557

-0.009118

NaN

-0.069861

0.007665

0.001330

0.034297

0.021642

...

1.000000

NaN

-0.045952

0.024054

0.002497

0.019604

0.019367

-0.015123

0.033493

-0.000867

StandardHours

NaN

NaN

NaN

NaN

NaN

NaN

NaN

NaN

NaN

NaN

...

NaN

NaN

NaN

NaN

NaN

NaN

NaN

NaN

NaN

NaN

StockOptionLevel

0.037510

0.042143

0.044872

0.018422

NaN

0.062227

0.003432

0.050263

0.021523

0.013984

...

-0.045952

NaN

1.000000

0.010136

0.011274

0.004129

0.015058

0.050818

0.014352

0.024698

TotalWorkingYears

0.680381

0.014515

0.004628

0.148280

NaN

-0.014365

-0.002693

-0.002334

-0.005533

0.782208

...

0.024054

NaN

0.010136

1.000000

-0.035662

0.001008

0.628133

0.460365

0.404858

0.459188

TrainingTimesLastYear

-0.019621

0.002453

-0.036942

-0.025100

NaN

0.023603

-0.019359

-0.008548

-0.015338

-0.018191

...

0.002497

NaN

0.011274

-0.035662

1.000000

0.028072

0.003569

-0.005738

-0.002067

-0.004096

WorkLifeBalance

-0.021490

-0.037848

-0.026556

0.009819

NaN

0.010309

0.027627

-0.004607

-0.014617

0.037818

...

0.019604

NaN

0.004129

0.001008

0.028072

1.000000

0.012089

0.049856

0.008941

0.002759

YearsAtCompany

0.311309

-0.034055

0.009508

0.069114

NaN

-0.011240

0.001458

-0.019582

-0.021355

0.534739

...

0.019367

NaN

0.015058

0.628133

0.003569

0.012089

1.000000

0.758754

0.618409

0.769212

YearsInCurrentRole

0.212901

0.009932

0.018845

0.060236

NaN

-0.008416

0.018007

-0.024106

0.008717

0.389447

...

-0.015123

NaN

0.050818

0.460365

-0.005738

0.049856

0.758754

1.000000

0.548056

0.714365

YearsSinceLastPromotion

0.216513

-0.033229

0.010029

0.054254

NaN

-0.009019

0.016194

-0.026716

-0.024184

0.353885

...

0.033493

NaN

0.014352

0.404858

-0.002067

0.008941

0.618409

0.548056

1.000000

0.510224

YearsWithCurrManager

0.202089

-0.026363

0.014406

0.069065

NaN

-0.009197

-0.004999

-0.020123

0.025976

0.375281

...

-0.000867

NaN

0.024698

0.459188

-0.004096

0.002759

0.769212

0.714365

0.510224

1.000000

26 rows × 26 columns

In [15]:

sns.boxplot(df.Age)
C:\Users\mahes\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

Out[15]:

<AxesSubplot:xlabel='Age'>

In [16]:

df.head()

Out[16]:

Age

Attrition

BusinessTravel

DailyRate

Department

DistanceFromHome

Education

EducationField

EmployeeCount

EmployeeNumber

...

RelationshipSatisfaction

StandardHours

StockOptionLevel

TotalWorkingYears

TrainingTimesLastYear

WorkLifeBalance

YearsAtCompany

YearsInCurrentRole

YearsSinceLastPromotion

YearsWithCurrManager

0

41

Yes

Travel_Rarely

1102

Sales

1

2

Life Sciences

1

1

...

1

80

0

8

0

1

6

4

0

5

1

49

No

Travel_Frequently

279

Research & Development

8

1

Life Sciences

1

2

...

4

80

1

10

3

3

10

7

1

7

2

37

Yes

Travel_Rarely

1373

Research & Development

2

2

Other

1

4

...

2

80

0

7

3

3

0

0

0

0

3

33

No

Travel_Frequently

1392

Research & Development

3

4

Life Sciences

1

5

...

3

80

0

8

3

3

8

7

3

0

4

27

No

Travel_Rarely

591

Research & Development

2

1

Medical

1

7

...

4

80

1

6

3

3

2

2

2

2

5 rows × 35 columns

In [17]:

df

Out[17]:

Age

Attrition

BusinessTravel

DailyRate

Department

DistanceFromHome

Education

EducationField

EmployeeCount

EmployeeNumber

...

RelationshipSatisfaction

StandardHours

StockOptionLevel

TotalWorkingYears

TrainingTimesLastYear

WorkLifeBalance

YearsAtCompany

YearsInCurrentRole

YearsSinceLastPromotion

YearsWithCurrManager

0

41

Yes

Travel_Rarely

1102

Sales

1

2

Life Sciences

1

1

...

1

80

0

8

0

1

6

4

0

5

1

49

No

Travel_Frequently

279

Research & Development

8

1

Life Sciences

1

2

...

4

80

1

10

3

3

10

7

1

7

2

37

Yes

Travel_Rarely

1373

Research & Development

2

2

Other

1

4

...

2

80

0

7

3

3

0

0

0

0

3

33

No

Travel_Frequently

1392

Research & Development

3

4

Life Sciences

1

5

...

3

80

0

8

3

3

8

7

3

0

4

27

No

Travel_Rarely

591

Research & Development

2

1

Medical

1

7

...

4

80

1

6

3

3

2

2

2

2

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

1465

36

No

Travel_Frequently

884

Research & Development

23

2

Medical

1

2061

...

3

80

1

17

3

3

5

2

0

3

1466

39

No

Travel_Rarely

613

Research & Development

6

1

Medical

1

2062

...

1

80

1

9

5

3

7

7

1

7

1467

27

No

Travel_Rarely

155

Research & Development

4

3

Life Sciences

1

2064

...

2

80

1

6

0

3

6

2

0

3

1468

49

No

Travel_Frequently

1023

Sales

2

3

Medical

1

2065

...

4

80

0

17

3

2

9

6

0

8

1469

34

No

Travel_Rarely

628

Research & Development

8

3

Medical

1

2068

...

1

80

0

6

3

4

4

3

1

2

1470 rows × 35 columns

In [ ]:

#Splitting Dependent and Independent variables
#Dependant - Attrition
#Independant - All others

In [20]:

x = df.drop(columns= ['Attrition'], axis=1)
x.head()

Out[20]:

Age

BusinessTravel

DailyRate

Department

DistanceFromHome

Education

EducationField

EmployeeCount

EmployeeNumber

EnvironmentSatisfaction

...

RelationshipSatisfaction

StandardHours

StockOptionLevel

TotalWorkingYears

TrainingTimesLastYear

WorkLifeBalance

YearsAtCompany

YearsInCurrentRole

YearsSinceLastPromotion

YearsWithCurrManager

0

41

Travel_Rarely

1102

Sales

1

2

Life Sciences

1

1

2

...

1

80

0

8

0

1

6

4

0

5

1

49

Travel_Frequently

279

Research & Development

8

1

Life Sciences

1

2

3

...

4

80

1

10

3

3

10

7

1

7

2

37

Travel_Rarely

1373

Research & Development

2

2

Other

1

4

4

...

2

80

0

7

3

3

0

0

0

0

3

33

Travel_Frequently

1392

Research & Development

3

4

Life Sciences

1

5

4

...

3

80

0

8

3

3

8

7

3

0

4

27

Travel_Rarely

591

Research & Development

2

1

Medical

1

7

1

...

4

80

1

6

3

3

2

2

2

2

5 rows × 34 columns

In [21]:

y = df.Attrition
y.head

Out[21]:

<bound method NDFrame.head of 0       Yes
1        No
2       Yes
3        No
4        No
       ... 
1465     No
1466     No
1467     No
1468     No
1469     No
Name: Attrition, Length: 1470, dtype: object>

In [ ]:

#Label Encoding

In [22]:

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
x.BusinessTravel=le.fit_transform(x.BusinessTravel)
x.Department=le.fit_transform(x.Department)
x.EducationField=le.fit_transform(x.EducationField)
x.Gender=le.fit_transform(x.Gender)
x.JobRole=le.fit_transform(x.JobRole)
x.MaritalStatus=le.fit_transform(x.MaritalStatus)
x.Over18=le.fit_transform(x.Over18)
x.OverTime=le.fit_transform(x.OverTime)
 
x.head()

Out[22]:

Age

BusinessTravel

DailyRate

Department

DistanceFromHome

Education

EducationField

EmployeeCount

EmployeeNumber

EnvironmentSatisfaction

...

RelationshipSatisfaction

StandardHours

StockOptionLevel

TotalWorkingYears

TrainingTimesLastYear

WorkLifeBalance

YearsAtCompany

YearsInCurrentRole

YearsSinceLastPromotion

YearsWithCurrManager

0

41

2

1102

2

1

2

1

1

1

2

...

1

80

0

8

0

1

6

4

0

5

1

49

1

279

1

8

1

1

1

2

3

...

4

80

1

10

3

3

10

7

1

7

2

37

2

1373

1

2

2

4

1

4

4

...

2

80

0

7

3

3

0

0

0

0

3

33

1

1392

1

3

4

1

1

5

4

...

3

80

0

8

3

3

8

7

3

0

4

27

2

591

1

2

1

3

1

7

1

...

4

80

1

6

3

3

2

2

2

2

5 rows × 34 columns

In [ ]:

#Feature Scaling using MINMAX

In [23]:

from sklearn.preprocessing import MinMaxScaler
ms = MinMaxScaler()
x_Scaled= pd.DataFrame(ms.fit_transform(x),columns=x.columns)
x_Scaled

Out[23]:

Age

BusinessTravel

DailyRate

Department

DistanceFromHome

Education

EducationField

EmployeeCount

EmployeeNumber

EnvironmentSatisfaction

...

RelationshipSatisfaction

StandardHours

StockOptionLevel

TotalWorkingYears

TrainingTimesLastYear

WorkLifeBalance

YearsAtCompany

YearsInCurrentRole

YearsSinceLastPromotion

YearsWithCurrManager

0

0.547619

1.0

0.715820

1.0

0.000000

0.25

0.2

0.0

0.000000

0.333333

...

0.000000

0.0

0.000000

0.200

0.000000

0.000000

0.150

0.222222

0.000000

0.294118

1

0.738095

0.5

0.126700

0.5

0.250000

0.00

0.2

0.0

0.000484

0.666667

...

1.000000

0.0

0.333333

0.250

0.500000

0.666667

0.250

0.388889

0.066667

0.411765

2

0.452381

1.0

0.909807

0.5

0.035714

0.25

0.8

0.0

0.001451

1.000000

...

0.333333

0.0

0.000000

0.175

0.500000

0.666667

0.000

0.000000

0.000000

0.000000

3

0.357143

0.5

0.923407

0.5

0.071429

0.75

0.2

0.0

0.001935

1.000000

...

0.666667

0.0

0.000000

0.200

0.500000

0.666667

0.200

0.388889

0.200000

0.000000

4

0.214286

1.0

0.350036

0.5

0.035714

0.00

0.6

0.0

0.002903

0.000000

...

1.000000

0.0

0.333333

0.150

0.500000

0.666667

0.050

0.111111

0.133333

0.117647

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

1465

0.428571

0.5

0.559771

0.5

0.785714

0.25

0.6

0.0

0.996613

0.666667

...

0.666667

0.0

0.333333

0.425

0.500000

0.666667

0.125

0.111111

0.000000

0.176471

1466

0.500000

1.0

0.365784

0.5

0.178571

0.00

0.6

0.0

0.997097

1.000000

...

0.000000

0.0

0.333333

0.225

0.833333

0.666667

0.175

0.388889

0.066667

0.411765

1467

0.214286

1.0

0.037938

0.5

0.107143

0.50

0.2

0.0

0.998065

0.333333

...

0.333333

0.0

0.333333

0.150

0.000000

0.666667

0.150

0.111111

0.000000

0.176471

1468

0.738095

0.5

0.659270

1.0

0.035714

0.50

0.6

0.0

0.998549

1.000000

...

1.000000

0.0

0.000000

0.425

0.500000

0.333333

0.225

0.333333

0.000000

0.470588

1469

0.380952

1.0

0.376521

0.5

0.250000

0.50

0.6

0.0

1.000000

0.333333

...

0.000000

0.0

0.000000

0.150

0.500000

1.000000

0.100

0.166667

0.066667

0.117647

1470 rows × 34 columns

In [ ]:

#Splitting Data into Train and Test.

In [25]:

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_Scaled,y,test_size=0.2,random_state=0)

In [26]:

x_train.shape,x_test.shape,y_train.shape,y_test.shape

Out[26]:

((1176, 34), (294, 34), (1176,), (294,))

In [28]:

x_train.head()

Out[28]:

Age

BusinessTravel

DailyRate

Department

DistanceFromHome

Education

EducationField

EmployeeCount

EmployeeNumber

EnvironmentSatisfaction

...

RelationshipSatisfaction

StandardHours

StockOptionLevel

TotalWorkingYears

TrainingTimesLastYear

WorkLifeBalance

YearsAtCompany

YearsInCurrentRole

YearsSinceLastPromotion

YearsWithCurrManager

1374

0.952381

1.0

0.360057

1.0

0.714286

0.50

0.2

0.0

0.937107

1.000000

...

0.666667

0.0

0.333333

0.725

0.333333

0.333333

0.025

0.000000

0.000000

0.000000

1092

0.642857

1.0

0.607015

0.5

0.964286

0.50

1.0

0.0

0.747460

1.000000

...

1.000000

0.0

0.333333

0.200

0.500000

0.666667

0.125

0.222222

0.000000

0.176471

768

0.523810

1.0

0.141732

1.0

0.892857

0.50

0.4

0.0

0.515239

0.666667

...

0.333333

0.0

0.333333

0.200

0.500000

0.333333

0.175

0.388889

0.466667

0.294118

569

0.428571

0.0

0.953472

1.0

0.250000

0.75

0.2

0.0

0.381229

0.000000

...

0.333333

0.0

0.000000

0.250

0.166667

0.666667

0.250

0.388889

0.000000

0.529412

911

0.166667

0.5

0.355762

1.0

0.821429

0.00

0.2

0.0

0.615385

0.666667

...

1.000000

0.0

0.000000

0.025

0.666667

0.666667

0.025

0.000000

0.066667

0.000000

5 rows × 34 columns

MODEL BUILDING

LOGISTIC REGRESSION

In [29]:

from sklearn.linear_model import LogisticRegression
modellr=LogisticRegression()

In [30]:

modellr.fit(x_train,y_train)

Out[30]:

LogisticRegression()

In [31]:

pred=modellr.predict(x_test)
pred

Out[31]:

array(['No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'No',
       'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No'],
      dtype=object)

In [32]:

y_test

Out[32]:

442      No
1091     No
981     Yes
785      No
1332    Yes
       ... 
1439     No
481      No
124     Yes
198      No
1229     No
Name: Attrition, Length: 294, dtype: object

In [ ]:

#Evaluation of Classification Model

In [33]:

#Accuracy score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score,roc_curve

In [34]:

accuracy_score(y_test,pred)

Out[34]:

0.8843537414965986

In [35]:

confusion_matrix(y_test,pred)

Out[35]:

array([[242,   3],
       [ 31,  18]], dtype=int64)

In [36]:

pd.crosstab(y_test,pred)

Out[36]:

col_0

No

Yes

Attrition

No

242

3

Yes

31

18

In [ ]:

#Performance Metrics:

In [37]:

print(classification_report(y_test,pred))
              precision    recall  f1-score   support
 
          No       0.89      0.99      0.93       245
         Yes       0.86      0.37      0.51        49
 
    accuracy                           0.88       294
   macro avg       0.87      0.68      0.72       294
weighted avg       0.88      0.88      0.86       294
 

In [38]:

probability=modellr.predict_proba(x_test)[:,1]
probability

Out[38]:

array([0.16000127, 0.20600667, 0.31532384, 0.09242886, 0.63667551,
       0.06153061, 0.61819432, 0.0757087 , 0.00841372, 0.3912069 ,
       0.05398439, 0.33293123, 0.02020698, 0.67215483, 0.19786547,
       0.03454902, 0.11043981, 0.17101703, 0.04477777, 0.22783614,
       0.2335018 , 0.01553905, 0.06464492, 0.05029956, 0.58792413,
       0.44849464, 0.07412714, 0.04460935, 0.67666632, 0.0584383 ,
       0.01599026, 0.03521098, 0.06963085, 0.17397462, 0.07830857,
       0.04288032, 0.08150424, 0.07106342, 0.03622137, 0.05223965,
       0.04862098, 0.02091497, 0.01819361, 0.01362467, 0.02873997,
       0.50236969, 0.41553218, 0.00306874, 0.73976412, 0.51382382,
       0.09637213, 0.48845516, 0.08036228, 0.25757243, 0.66516772,
       0.26308027, 0.01964858, 0.30198497, 0.02919946, 0.16038964,
       0.02102747, 0.21670232, 0.13981568, 0.0358316 , 0.37208403,
       0.03002317, 0.29091186, 0.16041142, 0.10437497, 0.08695177,
       0.08217589, 0.30984518, 0.08531362, 0.07420689, 0.12268651,
       0.06192552, 0.04640904, 0.07624712, 0.19738483, 0.03236316,
       0.00884439, 0.0244108 , 0.13635803, 0.0260104 , 0.03341008,
       0.08186888, 0.00499397, 0.03474852, 0.03858027, 0.14602694,
       0.26167665, 0.16667357, 0.27400109, 0.24159565, 0.02160421,
       0.17748606, 0.34076078, 0.28022482, 0.06914126, 0.05003806,
       0.24437761, 0.74698271, 0.35438567, 0.01920627, 0.08778845,
       0.03255847, 0.05461351, 0.15123251, 0.06843702, 0.13752637,
       0.09584388, 0.04669882, 0.02493091, 0.15383171, 0.07081259,
       0.03089296, 0.0537667 , 0.11554316, 0.00881616, 0.01263271,
       0.17552253, 0.05045234, 0.08823238, 0.82995757, 0.03017756,
       0.0236819 , 0.0087012 , 0.1349589 , 0.16474801, 0.05202613,
       0.01524549, 0.29278083, 0.54767448, 0.34275448, 0.04629541,
       0.38966344, 0.61333366, 0.14552367, 0.07402366, 0.24143471,
       0.09418418, 0.0689069 , 0.10061956, 0.19346327, 0.20026293,
       0.03004939, 0.14900424, 0.00348846, 0.11225149, 0.15843155,
       0.06047573, 0.18601882, 0.06085869, 0.12221317, 0.03280184,
       0.02738799, 0.06356425, 0.08302382, 0.01541716, 0.014665  ,
       0.38517822, 0.01264231, 0.14961974, 0.80508787, 0.11598661,
       0.2842811 , 0.17020143, 0.1530583 , 0.02764153, 0.00613226,
       0.04191632, 0.09782393, 0.11551417, 0.10377982, 0.01779313,
       0.14371315, 0.10615435, 0.10298963, 0.05132621, 0.09061081,
       0.02897383, 0.09924087, 0.00512032, 0.75108423, 0.04296968,
       0.04062134, 0.37518972, 0.04563128, 0.7251816 , 0.10671665,
       0.36949086, 0.38146941, 0.32095493, 0.05266802, 0.08172004,
       0.13947833, 0.04334317, 0.01469593, 0.26413988, 0.06330966,
       0.1614747 , 0.15380517, 0.67152357, 0.05840793, 0.27891823,
       0.04512564, 0.46033865, 0.00348431, 0.14068967, 0.02747401,
       0.12714133, 0.17284246, 0.07341066, 0.10099827, 0.16870885,
       0.02560842, 0.01824031, 0.08670796, 0.02834237, 0.13710215,
       0.08778935, 0.2200061 , 0.73401148, 0.15938978, 0.4095449 ,
       0.01513845, 0.11306309, 0.21497506, 0.32337575, 0.03409266,
       0.04256318, 0.32157531, 0.05454465, 0.02348479, 0.16423352,
       0.32696147, 0.22892063, 0.00877159, 0.08198819, 0.01156361,
       0.1408691 , 0.29235147, 0.01270305, 0.17329916, 0.04081391,
       0.04094165, 0.42771425, 0.34958286, 0.03766772, 0.12025286,
       0.37698923, 0.3192629 , 0.79559338, 0.05385659, 0.21597037,
       0.06383728, 0.00570991, 0.66018187, 0.35855286, 0.37783606,
       0.36781398, 0.03554512, 0.21718203, 0.05943622, 0.06554485,
       0.10081475, 0.00818713, 0.26591316, 0.42809675, 0.06542835,
       0.09296803, 0.01259826, 0.14226651, 0.05072662, 0.02372258,
       0.02586923, 0.06760427, 0.24315648, 0.26961432, 0.19831733,
       0.2652296 , 0.0165923 , 0.15784236, 0.08398982, 0.02711775,
       0.18750547, 0.00783535, 0.2844239 , 0.00270742, 0.02484969,
       0.22585745, 0.72775605, 0.07691547, 0.26304359])

In [ ]:

 

In [ ]:

DECISION TREE

In [ ]:

 

In [39]:

from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier()
dtc.fit(x_train,y_train)

Out[39]:

DecisionTreeClassifier()

In [40]:

pred=dtc.predict(x_test)
pred

Out[40]:

array(['No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No',
       'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No',
       'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No',
       'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'No',
       'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes',
       'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'Yes', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes',
       'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes',
       'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No'], dtype=object)

In [41]:

y_test

Out[41]:

442      No
1091     No
981     Yes
785      No
1332    Yes
       ... 
1439     No
481      No
124     Yes
198      No
1229     No
Name: Attrition, Length: 294, dtype: object

In [ ]:

#Evaluation of Classification Model

In [42]:

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score,roc_curve
accuracy_score(y_test,pred)

Out[42]:

0.7414965986394558

In [43]:

confusion_matrix(y_test,pred)

Out[43]:

array([[204,  41],
       [ 35,  14]], dtype=int64)

In [44]:

pd.crosstab(y_test,pred)

Out[44]:

col_0

No

Yes

Attrition

No

204

41

Yes

35

14

In [45]:

#Performance Metrics:
print(classification_report(y_test,pred))
              precision    recall  f1-score   support
 
          No       0.85      0.83      0.84       245
         Yes       0.25      0.29      0.27        49
 
    accuracy                           0.74       294
   macro avg       0.55      0.56      0.56       294
weighted avg       0.75      0.74      0.75       294
 

In [ ]:

 

In [ ]:

RANDOM FOREST

In [ ]:

 

In [52]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
rfc=RandomForestClassifier()

In [53]:

forest_params = [{'max_depth': list(range(10, 15)), 'max_features': list(range(0,14))}]

In [54]:

rfc_cv= GridSearchCV(rfc,param_grid=forest_params,cv=10,scoring="accuracy")

In [58]:

rfc_cv.fit(x_train,y_train)
C:\Users\mahes\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning: 
50 fits failed out of a total of 700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
 
Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\mahes\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\mahes\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "C:\Users\mahes\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\mahes\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\mahes\anaconda3\lib\site-packages\joblib\parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\mahes\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "C:\Users\mahes\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 572, in __init__
    self.results = batch()
  File "C:\Users\mahes\anaconda3\lib\site-packages\joblib\parallel.py", line 262, in __call__
    return [func(*args, **kwargs)
  File "C:\Users\mahes\anaconda3\lib\site-packages\joblib\parallel.py", line 262, in <listcomp>
    return [func(*args, **kwargs)
  File "C:\Users\mahes\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 216, in __call__
    return self.function(*args, **kwargs)
  File "C:\Users\mahes\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 185, in _parallel_build_trees
    tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
  File "C:\Users\mahes\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "C:\Users\mahes\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 308, in fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: max_features must be in (0, n_features]
 
  warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\mahes\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:969: UserWarning: One or more of the test scores are non-finite: [       nan 0.8460959  0.85203535 0.85969868 0.85627988 0.85712734
 0.85796755 0.86224105 0.8596842  0.85797479 0.86136462 0.85796031
 0.85881501 0.85543242        nan 0.84610314 0.85630161 0.85459221
 0.86054614 0.85967695 0.85883674 0.85883674 0.86221932 0.86393597
 0.85799652 0.8605389  0.85795306 0.86309576        nan 0.85290453
 0.85459945 0.85458496 0.85373026 0.85969868 0.86222657 0.85798928
 0.86223381 0.85713458 0.86563813 0.86222657 0.85796755 0.85797479
        nan 0.85289729 0.85544691 0.85970592 0.85883674 0.85969144
 0.85883674 0.8596842  0.85712734 0.85711285 0.85969868 0.85880776
 0.85797479 0.86051717        nan 0.84948573 0.8571708  0.85883674
 0.85457772 0.86137911 0.85798928 0.86650732 0.86308127 0.86053165
 0.86051717 0.85796755 0.86052441 0.85711285]
  warnings.warn(

Out[58]:

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid=[{'max_depth': [10, 11, 12, 13, 14],
                          'max_features': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                           12, 13]}],
             scoring='accuracy')

In [59]:

pred=rfc_cv.predict(x_test)
print(classification_report(y_test,pred))
              precision    recall  f1-score   support
 
          No       0.85      0.98      0.91       245
         Yes       0.67      0.16      0.26        49
 
    accuracy                           0.85       294
   macro avg       0.76      0.57      0.59       294
weighted avg       0.82      0.85      0.81       294
 

In [60]:

rfc_cv.best_params_

Out[60]:

{'max_depth': 14, 'max_features': 7}

In [ ]:

 

In [ ]:

 

In [ ]: